Jupyter notebook — kernel: python3 — title: Embedding EDA
import altair as alt
import os
import umap
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.patches as mpatches
from CryptoFraudDetection.utils import embedding
from CryptoFraudDetection.utils import enums
from CryptoFraudDetection.utils import logger
# Use Altair's dark theme for all charts in this notebook.
alt.themes.enable("dark")
ThemeRegistry.enable('dark')
# Load the scraped Reddit posts/comments from the processed parquet file.
reddit_posts_path = "../data/processed/reddit_posts.parquet"
df = pd.read_parquet(reddit_posts_path)
# Quick sanity check: preview the first five rows.
df.head(5)
| id | parent_id | author | body | created | depth | edited | score | search_query | subreddit | title | url | num_comments | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | yxu5tv | <NA> | magus-21 | Secretly lending customer funds, market-making... | 2022-11-17 16:10:14 | -1 | <NA> | 1597 | Safe Moon | r/CryptoCurrency | "DYOR" is worthless. You can't "Do Your Own Re... | https://www.reddit.com/r/CryptoCurrency/commen... | 634 |
| 1 | iwqji72 | yxu5tv | Hank___Scorpio | Some peoples research led them to safemoon. | 2022-11-17 16:34:34 | 0 | <NA> | 291 | Safe Moon | r/CryptoCurrency | <NA> | <NA> | <NA> |
| 2 | iwqnjit | iwqji72 | getoffthepitch96576 | I remember the whitepaper was a power point pr... | 2022-11-17 17:01:23 | 1 | <NA> | 88 | Safe Moon | r/CryptoCurrency | <NA> | <NA> | <NA> |
| 3 | iwqye4z | iwqnjit | TheTrueBlueTJ | Not to mention the shitfountain that was their... | 2022-11-17 18:11:55 | 2 | <NA> | 16 | Safe Moon | r/CryptoCurrency | <NA> | <NA> | <NA> |
| 4 | iwsv07b | iwqnjit | joikhuu | Still better than bitconnect's. | 2022-11-18 02:20:20 | 2 | <NA> | 2 | Safe Moon | r/CryptoCurrency | <NA> | <NA> | <NA> |
# How many rows (posts + comments) were collected per coin search query.
df["search_query"].value_counts()
search_query Bitcoin 142163 Chainlink 72875 Ethereum 72409 Safe Moon 69582 Cosmos 57082 Avalanche 35673 FTX Token 22597 THORChain 18772 Terra Luna 8984 BitForex 2761 BeerCoin 819 Teddy Doge 738 Name: count, dtype: Int64
# Count comments per (coin, subreddit) pair, then reshape to long format
# so Altair can stack bar segments by subreddit.
pair_counts = df.groupby(["search_query", "subreddit"]).size()
wide_counts = pair_counts.unstack().fillna(0).astype(int)
subreddit_query = wide_counts.reset_index().melt(id_vars="search_query")

# Stacked horizontal bars: one bar per coin, one segment per subreddit.
c = (
    alt.Chart(subreddit_query)
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title="Number of Comments",
            scale=alt.Scale(domain=[0, 152_000]),
        ),
        y=alt.Y("search_query:O", title="Coin"),
        color=alt.Color("subreddit:N", title="Subreddit"),
        tooltip=[
            alt.Tooltip("search_query:O", title="Coin"),
            alt.Tooltip("subreddit:N", title="Subreddit"),
            alt.Tooltip("value:Q", title="# of Comments"),
        ],
    )
    .properties(
        width=800,
        height=400,
        title=alt.Title(text="Number of Comments per Subreddit"),
    )
)

# Per-coin total labels rendered just to the right of each bar.
text = (
    alt.Chart(subreddit_query)
    .mark_text(align="left", dx=5, color="white")
    .transform_calculate(customtooltip="datum.value")
    .encode(
        x=alt.X("sum(value):Q", scale=alt.Scale(domain=[0, 152_000])),
        y=alt.Y("search_query:O"),
        text=alt.Text("sum(value):Q"),
    )
)
c + text
# Normalize comment counts within each coin so every coin's values sum to 1.
subreddit_query_normalized = subreddit_query.copy()
coin_totals = subreddit_query_normalized.groupby("search_query")["value"].transform("sum")
subreddit_query_normalized["value"] = subreddit_query_normalized["value"] / coin_totals

# Shared pieces between the heatmap and its text overlay.
heatmap_tooltip = [
    alt.Tooltip("search_query:O", title="Coin"),
    alt.Tooltip("subreddit:N", title="Subreddit"),
    alt.Tooltip("value:Q", title=r"% of Comments", format=".4%"),
]
blue_ramp = ["#ffffff", "#deecfb", "#bedaf7", "#7ab3ef", "#368ce7", "#1666ba"]

# Heatmap of the share of a coin's comments falling into each subreddit.
c = (
    alt.Chart(subreddit_query_normalized)
    .mark_rect()
    .encode(
        y=alt.Y("subreddit:N", title="Subreddit"),
        x=alt.X("search_query:O", title="Coin"),
        color=alt.Color(
            "value:Q",
            title="Number of Comments",
            scale=alt.Scale(range=blue_ramp),
            legend=None,
        ),
        tooltip=heatmap_tooltip,
    )
    .properties(
        width=800,
        height=800,
        title=alt.Title(text="Number of Comments per Subreddit (Normalized by Coin)"),
    )
)

# Percentage labels; white text on dark cells, black on light ones.
text = (
    alt.Chart(subreddit_query_normalized)
    .mark_text(align="center")
    .transform_calculate(customtooltip="datum.value")
    .encode(
        y=alt.Y("subreddit:N"),
        x=alt.X("search_query:O"),
        text=alt.Text("value:Q", format=".2%"),
        color=alt.condition(
            alt.datum.value >= 0.5, alt.value("white"), alt.value("black")
        ),
        tooltip=heatmap_tooltip,
    )
)
c + text
# Same breakdown as above but counting only original posts (depth == -1),
# not comments.
post_counts = (
    df.query("depth == -1").groupby(["search_query", "subreddit"]).size()
)
subreddit_query_posts = (
    post_counts.unstack()
    .fillna(0)
    .astype(int)
    .reset_index()
    .melt(id_vars="search_query")
)

# Stacked horizontal bars of post counts per coin, segmented by subreddit.
c = (
    alt.Chart(subreddit_query_posts)
    .mark_bar()
    .encode(
        x=alt.X(
            "value:Q",
            title="Number of Comments",
            scale=alt.Scale(domain=[0, 1_400]),
        ),
        y=alt.Y("search_query:O", title="Coin"),
        color=alt.Color("subreddit:N", title="Subreddit"),
        tooltip=[
            alt.Tooltip("search_query:O", title="Coin"),
            alt.Tooltip("subreddit:N", title="Subreddit"),
            alt.Tooltip("value:Q", title="# of Comments"),
        ],
    )
    .properties(
        width=800,
        height=400,
        title=alt.Title(text="Number of Posts per Subreddit"),
    )
)

# Per-coin total labels to the right of each bar.
text = (
    alt.Chart(subreddit_query_posts)
    .mark_text(align="left", dx=5, color="white")
    .transform_calculate(customtooltip="datum.value")
    .encode(
        x=alt.X("sum(value):Q", scale=alt.Scale(domain=[0, 1_400])),
        y=alt.Y("search_query:O"),
        text=alt.Text("sum(value):Q"),
    )
)
c + text
# Normalize post counts within each coin so every coin's values sum to 1.
subreddit_query_posts_normalized = subreddit_query_posts.copy()
posts_per_coin = subreddit_query_posts_normalized.groupby("search_query")[
    "value"
].transform("sum")
subreddit_query_posts_normalized["value"] = (
    subreddit_query_posts_normalized["value"] / posts_per_coin
)

# Shared pieces between the heatmap and its text overlay.
posts_tooltip = [
    alt.Tooltip("search_query:O", title="Coin"),
    alt.Tooltip("subreddit:N", title="Subreddit"),
    alt.Tooltip("value:Q", title=r"% of Posts", format=".4%"),
]
blue_ramp = ["#ffffff", "#deecfb", "#bedaf7", "#7ab3ef", "#368ce7", "#1666ba"]

# Heatmap of the share of a coin's posts falling into each subreddit.
c = (
    alt.Chart(subreddit_query_posts_normalized)
    .mark_rect()
    .encode(
        y=alt.Y("subreddit:N", title="Subreddit"),
        x=alt.X("search_query:O", title="Coin"),
        color=alt.Color(
            "value:Q",
            title="Number of Posts",
            scale=alt.Scale(range=blue_ramp),
            legend=None,
        ),
        tooltip=posts_tooltip,
    )
    .properties(
        width=800,
        height=800,
        title=alt.Title(text="Number of Posts per Subreddit (Normalized by Coin)"),
    )
)

# Percentage labels; white text only on the darkest cells (> 70%).
text = (
    alt.Chart(subreddit_query_posts_normalized)
    .mark_text(align="center")
    .transform_calculate(customtooltip="datum.value")
    .encode(
        y=alt.Y("subreddit:N"),
        x=alt.X("search_query:O"),
        text=alt.Text("value:Q", format=".2%"),
        color=alt.condition(
            alt.datum.value > 0.7, alt.value("white"), alt.value("black")
        ),
        tooltip=posts_tooltip,
    )
)
c + text
# Distribution of comment nesting depth (-1 marks the original post).
depth_df = df["depth"].value_counts().sort_index()
depth_data = depth_df.reset_index()

# Bar chart of comment counts per depth level.
c = (
    alt.Chart(depth_data)
    .mark_bar()
    .encode(
        x=alt.X("depth:O", title="Depth (-1 is the original post)"),
        y=alt.Y(
            "count:Q",
            title="Number of Comments",
            scale=alt.Scale(domain=[0, 240_000]),
        ),
        tooltip=[
            alt.Tooltip("depth:O", title="Depth"),
            alt.Tooltip("count:Q", title="# of Comments"),
        ],
    )
    .properties(
        width=800,
        height=400,
        title=alt.Title(text="Number of Comments per Depth"),
    )
)

# Count labels drawn just above each bar.
text = (
    alt.Chart(depth_data)
    .mark_text(align="center", dy=-8, color="white")
    .transform_calculate(customtooltip="datum.count")
    .encode(
        x=alt.X("depth:O"),
        y=alt.Y("count:Q", scale=alt.Scale(domain=[0, 240_000])),
        text=alt.Text("count:Q"),
    )
)
c + text
# Distribution of how many comments each user wrote (long-tail histogram).
comments_per_user = df.groupby("author").size()
distribution = comments_per_user.value_counts().sort_index()
distribution_df = distribution.reset_index()
distribution_df.columns = ["comments_per_user", "number_of_users"]
c = (
    alt.Chart(distribution_df)
    .mark_bar()
    .encode(
        x=alt.X(
            "comments_per_user:O",
            title="Number of Comments per User",
            # Use an explicit list: a bare `range` object is not
            # JSON-serializable, which breaks chart serialization/export.
            scale={"domain": list(range(1, 101))},
        ),
        y=alt.Y(
            "number_of_users:Q",
            title="Number of Users",
            # symlog keeps the zero baseline visible while spreading the
            # heavy-tailed counts.
            scale=alt.Scale(
                type="symlog",
                domain=[0, 100_000],
            ),
            axis=alt.Axis(
                values=[
                    0,
                    1,
                    2,
                    5,
                    10,
                    20,
                    50,
                    100,
                    200,
                    500,
                    1_000,
                    2_000,
                    5_000,
                    10_000,
                    20_000,
                    50_000,
                    100_000,
                ]
            ),
        ),
        tooltip=[
            alt.Tooltip("comments_per_user:Q", title="# of Comments per User"),
            alt.Tooltip("number_of_users:Q", title="# of Users"),
        ],
    )
    .properties(
        width=1000,
        height=400,
        title=alt.Title(text="Number of Comments per User"),
    )
)
c
# The ten most active comment authors (includes bots / deleted accounts).
comments_per_user.sort_values(ascending=False).head(10)
author None 39167 donut-bot 8664 AutoModerator 4385 kirtash93 1969 CointestMod 1219 Every_Hunt_160 1005 Objective_Digit 973 goldyluckinblokchain 708 partymsl 628 coinfeeds-bot 626 dtype: int64
# Distribution of how many original posts (depth == -1) each user created.
posts_per_user = df.query("depth == -1").groupby("author").size()
distribution = posts_per_user.value_counts().sort_index()
distribution_df = distribution.reset_index()
distribution_df.columns = ["posts_per_user", "number_of_users"]
c = (
    alt.Chart(distribution_df)
    .mark_bar()
    .encode(
        x=alt.X(
            "posts_per_user:O",
            title="Number of Posts per User",
            # Use an explicit list: a bare `range` object is not
            # JSON-serializable, which breaks chart serialization/export.
            scale={"domain": list(range(1, 101))},
        ),
        y=alt.Y(
            "number_of_users:Q",
            title="Number of Posts",
            # symlog keeps the zero baseline visible while spreading the
            # heavy-tailed counts.
            scale=alt.Scale(
                type="symlog",
                domain=[0, 5_000],
                zero=True,
            ),
            axis=alt.Axis(
                values=[
                    0,
                    1,
                    2,
                    5,
                    10,
                    20,
                    50,
                    100,
                    200,
                    500,
                    1_000,
                    2_000,
                    5_000,
                ]
            ),
        ),
        tooltip=[
            alt.Tooltip("posts_per_user:Q", title="# of Posts per User"),
            alt.Tooltip("number_of_users:Q", title="# of Users"),
        ],
    )
    .properties(
        width=1000,
        height=400,
        title=alt.Title(text="Number of Posts per User"),
    )
)
c
# The ten most prolific post authors (includes deleted accounts).
posts_per_user.sort_values(ascending=False).head(10)
author None 382 kirtash93 63 goldyluckinblokchain 52 hiorea 46 InclineDumbbellPress 41 Odd-Radio-8500 33 SigiNwanne 27 partymsl 23 Ok_Source4689 22 Creative_Ad7831 16 dtype: int64
# Monthly post counts per coin, restricted to original posts from 2020 on.
monthly = df.copy()
# Truncate timestamps to the first day of their month.
monthly["created"] = (
    pd.to_datetime(monthly["created"]).dt.to_period("M").dt.to_timestamp()
)
monthly = monthly.query("created >= '2020-01-01' and depth == -1")
posts_per_coin_per_date = (
    monthly.groupby(["search_query", "created"])["id"]
    .count()
    .reset_index()
    .rename(columns={"id": "number_of_posts"})
)
# Relative share of each coin's posts falling into each month.
coin_post_totals = posts_per_coin_per_date.groupby("search_query")[
    "number_of_posts"
].transform("sum")
posts_per_coin_per_date["number_of_posts_rel"] = (
    posts_per_coin_per_date["number_of_posts"] / coin_post_totals
)
posts_per_coin_per_date
# Ridgeline ("joyplot") of monthly post share per coin.
# `step` is the pixel height of each facet row; `overlap` controls how far
# a row's area may spill into the row above it.
step = 40
overlap = 1
c = (
    alt.Chart(posts_per_coin_per_date, height=step)
    .mark_area(
        interpolate="monotone",
        fillOpacity=0.5,
        stroke="lightgray",
        strokeWidth=0.5,
    )
    .encode(
        x=alt.X("created:T").title("Date").axis(grid=False),
        # The negative upper range bound lets peaks rise above the row's
        # own band and overlap the facet above.
        y=alt.Y("number_of_posts_rel:Q")
        .axis(None)
        .scale(range=[step, -step * overlap]),
        color=alt.Color("search_query:N", legend=None),
        tooltip=[
            alt.Tooltip("yearmonth(created):T", title="Date"),
            alt.Tooltip("search_query:N", title="Coin"),
            alt.Tooltip("number_of_posts:Q", title="# of Posts"),
        ],
    )
    .properties(
        width=800,
        height=step,
    )
    # One facet row per coin, with horizontal left-aligned labels.
    .facet(
        row=alt.Row("search_query:N")
        .title(None)
        .header(labelAngle=0, labelAlign="left")
    )
    .properties(
        title=alt.Title(
            text="Number of Posts per Coin per Date after 2020",
            anchor="middle",
        ),
        bounds="flush",
    )
    # Zero spacing + no view stroke makes the rows read as one ridgeline.
    .configure_facet(spacing=0)
    .configure_view(stroke=None)
    .configure_title(anchor="end")
)
c
# Load the Reddit dataset with precomputed text embeddings per row.
reddit_parquet = "../data/processed/reddit_embedded.parquet"
df = pd.read_parquet(reddit_parquet)
# German: "DataFrame loaded successfully."
print("DataFrame erfolgreich geladen.")
DataFrame erfolgreich geladen.
# Preview the first rows, including the new `embedded_text` column.
df.head(5)
| id | parent_id | author | body | created | depth | edited | score | search_query | subreddit | title | url | num_comments | embedded_text | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | yxu5tv | <NA> | magus-21 | Secretly lending customer funds, market-making... | 2022-11-17 16:10:14 | -1 | <NA> | 1597 | Safe Moon | r/CryptoCurrency | "DYOR" is worthless. You can't "Do Your Own Re... | https://www.reddit.com/r/CryptoCurrency/commen... | 634 | [-0.28282067, -0.2284037, 0.04313869, 0.076180... |
| 1 | iwqji72 | yxu5tv | Hank___Scorpio | Some peoples research led them to safemoon. | 2022-11-17 16:34:34 | 0 | <NA> | 291 | Safe Moon | r/CryptoCurrency | <NA> | <NA> | <NA> | [-0.4349563, -0.56824857, -0.2638307, 0.357553... |
| 2 | iwqnjit | iwqji72 | getoffthepitch96576 | I remember the whitepaper was a power point pr... | 2022-11-17 17:01:23 | 1 | <NA> | 88 | Safe Moon | r/CryptoCurrency | <NA> | <NA> | <NA> | [-0.3600267, -0.21511783, 0.08681131, 0.751270... |
| 3 | iwqye4z | iwqnjit | TheTrueBlueTJ | Not to mention the shitfountain that was their... | 2022-11-17 18:11:55 | 2 | <NA> | 16 | Safe Moon | r/CryptoCurrency | <NA> | <NA> | <NA> | [-0.077001244, -0.4535337, 0.20802015, 0.74446... |
| 4 | iwsv07b | iwqnjit | joikhuu | Still better than bitconnect's. | 2022-11-18 02:20:20 | 2 | <NA> | 2 | Safe Moon | r/CryptoCurrency | <NA> | <NA> | <NA> | [-0.44820178, -0.32241026, 0.13388954, 0.76202... |
# Inspect the available columns of the embedded dataset.
df.columns
Index(['id', 'parent_id', 'author', 'body', 'created', 'depth', 'edited',
'score', 'search_query', 'subreddit', 'title', 'url', 'num_comments',
'embedded_text'],
dtype='object')
# Hold out four coins as a test set and exclude them from this analysis.
coin_test = ['FTX Token', 'Safe Moon', 'Ethereum', 'Cosmos']
df = df[~df['search_query'].isin(coin_test)]

# Attach per-coin metadata (including the fraud flag) from the raw coins file.
with open('../data/raw/coins.json', 'r') as f:
    coins_info_df = pd.DataFrame(json.load(f))
merged_df = df.merge(coins_info_df, left_on="search_query", right_on="name", how="left")

# Stack the per-row embedding vectors into one 2-D array for PCA/UMAP.
embeddings = np.vstack(merged_df["embedded_text"].values)
fraud_labels = merged_df["fraud"]
# Reduce the embeddings to 2-D with PCA (linear) and UMAP (non-linear).
pca = PCA(n_components=2)
embeddings_pca = pca.fit_transform(embeddings)
umap_reducer = umap.UMAP(n_components=2, random_state=42, metric='cosine')
embeddings_umap = umap_reducer.fit_transform(embeddings)

# Scatter both projections side by side, colored by the fraud label.
colors = ['red' if fraud else 'blue' for fraud in fraud_labels]
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = [
    (embeddings_pca, "PCA", "Component", 0.25),
    (embeddings_umap, "UMAP", "Dimension", 0.15),
]
for ax, (points, method, axis_word, alpha) in zip(axes, panels):
    ax.scatter(points[:, 0], points[:, 1], c=colors, alpha=alpha, edgecolor='k')
    ax.set_title(f"{method} of Embeddings")
    ax.set_xlabel(f"{method} {axis_word} 1")
    ax.set_ylabel(f"{method} {axis_word} 2")

# Manual proxy handles so the legend shows the two label colors.
fraud_legend = [
    plt.Line2D([0], [0], marker='o', color='w', label=label,
               markerfacecolor=face, markersize=10)
    for label, face in (('Fraud', 'red'), ('Non-Fraud', 'blue'))
]
fig.legend(handles=fraud_legend, loc="upper right")
plt.tight_layout()
plt.show()
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
Here we can see the embeddings plotted using the top two PCA and UMAP components. The color represents whether the post was about a scam or a non-scam coin. There are clusters of scam and non-scam embeddings visible, but it could be that these clusters simply correspond to posts about the same coin.
# Color the same PCA/UMAP projections by coin instead of by fraud label.
coin_labels = merged_df["search_query"].values
unique_coins = np.unique(coin_labels)
# tab10 is a qualitative (listed) colormap: index it with integers so each
# coin gets a distinct palette entry. The previous fractional sampling
# (i / len(unique_coins)) skipped palette entries and could assign the same
# color to two coins.
coin_colors = {coin: plt.cm.tab10(i % 10) for i, coin in enumerate(unique_coins)}
colors = [coin_colors[coin] for coin in coin_labels]
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
axes[0].scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], c=colors, alpha=0.1, edgecolor='k')
axes[0].set_title("PCA of Embeddings")
axes[0].set_xlabel("PCA Component 1")
axes[0].set_ylabel("PCA Component 2")
axes[1].scatter(embeddings_umap[:, 0], embeddings_umap[:, 1], c=colors, alpha=0.1, edgecolor='k')
axes[1].set_title("UMAP of Embeddings")
axes[1].set_xlabel("UMAP Dimension 1")
axes[1].set_ylabel("UMAP Dimension 2")
# Proxy handles so the legend shows one entry per coin.
legend_elements = [
    plt.Line2D([0], [0], marker='o', color='w', label=coin, markerfacecolor=coin_colors[coin], markersize=10)
    for coin in unique_coins
]
fig.legend(handles=legend_elements, loc="upper right", title="Coins")
plt.tight_layout()
plt.show()
Here we can see the same plot, but with the color representing the coin. Bitcoin is the most common coin in the dataset and therefore the most visible. It is hard to interpret anything because the Bitcoin posts are so dominant. There may be some clusters, but with this many data points they are hard to see.
# Project each coin's embeddings separately so dominant coins (e.g. Bitcoin)
# cannot drown out the structure of the smaller ones.
unique_keywords = df['search_query'].unique()
for keyword in unique_keywords:
    keyword_df = df[df['search_query'] == keyword]
    embeddings = np.vstack(keyword_df['embedded_text'].values)

    # Fit PCA and UMAP on this coin's embeddings only.
    pca = PCA(n_components=2)
    embeddings_pca = pca.fit_transform(embeddings)
    umap_reducer = umap.UMAP(n_components=2, metric='cosine', random_state=42)
    embeddings_umap = umap_reducer.fit_transform(embeddings)

    # One figure per coin: PCA on the left, UMAP on the right.
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = [
        (embeddings_pca, "PCA", "Component"),
        (embeddings_umap, "UMAP", "Dimension"),
    ]
    for ax, (points, method, axis_word) in zip(axes, panels):
        ax.scatter(points[:, 0], points[:, 1], alpha=0.7, edgecolor='k')
        ax.set_title(f"{method} of Embeddings for {keyword}")
        ax.set_xlabel(f"{method} {axis_word} 1")
        ax.set_ylabel(f"{method} {axis_word} 2")
    plt.tight_layout()
    plt.show()
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
C:\Users\flori\Documents\BSc_Data-Science_FHNW\Module\CX5\venv\Lib\site-packages\umap\umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism. warn(
Above we can see the same plots for each coin individually. There is no specific indication that certain clusters correspond to scams or non-scams. For example, the plots for Avalanche (non-scam) and BitForex (scam) look similar.